{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "# !wget https://huggingface.co/datasets/mesolitica/translated-MNLI/resolve/main/translated-mnli-train.jsonl\n",
    "# !wget https://huggingface.co/datasets/mesolitica/translated-MNLI/resolve/main/translated-mnli-dev_matched.jsonl"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "# !wget https://huggingface.co/datasets/mesolitica/translated-MNLI/resolve/main/anli.translated.jsonl"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "# !wget https://huggingface.co/datasets/mesolitica/translated-indonli/raw/main/train.jsonl\n",
    "# !wget https://huggingface.co/datasets/mesolitica/translated-indonli/raw/main/validation.jsonl"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0.4137843164565038"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from tqdm import tqdm\n",
    "from glob import glob\n",
    "import json\n",
    "import random\n",
    "\n",
    "random.random()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Class ids are the positions in this list: 0 = contradiction, 1 = entailment.\n",
    "# Any gold_label that is not listed here is skipped by the MNLI loops further down.\n",
    "labels = ['contradiction', 'entailment']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "{\"annotator_labels\": [\"neutral\"], \"genre\": \"government\", \"gold_label\": \"neutral\", \"pairID\": \"31193n\", \"promptID\": \"31193\", \"sentence1\": \"Conceptually cream skimming has two basic dimensions - product and geography.\", \"sentence1_binary_parse\": \"( ( Conceptually ( cream skimming ) ) ( ( has ( ( ( two ( basic dimensions ) ) - ) ( ( product and ) geography ) ) ) . ) )\", \"sentence1_parse\": \"(ROOT (S (NP (JJ Conceptually) (NN cream) (NN skimming)) (VP (VBZ has) (NP (NP (CD two) (JJ basic) (NNS dimensions)) (: -) (NP (NN product) (CC and) (NN geography)))) (. .)))\", \"sentence2\": \"Product and geography are what make cream skimming work. \", \"sentence2_binary_parse\": \"( ( ( Product and ) geography ) ( ( are ( what ( make ( cream ( skimming work ) ) ) ) ) . ) )\", \"sentence2_parse\": \"(ROOT (S (NP (NN Product) (CC and) (NN geography)) (VP (VBP are) (SBAR (WHNP (WP what)) (S (VP (VBP make) (NP (NP (NN cream)) (VP (VBG skimming) (NP (NN work)))))))) (. .)))\", \"translate\": [\"Skim krim konseptual mempunyai dua dimensi asas - produk dan geografi.\", \"Produk dan geografi adalah apa yang membuat krim skimming berfungsi.\"]}\r\n"
     ]
    }
   ],
   "source": [
    "!head -n 1 translated-mnli-train.jsonl"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "{\"uid\": \"2093cfb3-a15f-4282-81e3-0cb793ffd0d7\", \"premise\": \"TOKYO, Dec 18 (Reuters) - Japan\\u2019s Shionogi & Co said on Tuesday that it has applied to health regulators in the United States, Canada and Europe for approval of its HIV drug Dolutegravir. Shionogi developed Dolutegravir with a Viiv Healthcare, an AIDS drug joint venture between GlaxoSmithKline and Pfizer, in exchange for its rights to the drug.\", \"hypothesis\": \"The article was written on December 18th.\", \"label\": 0, \"reason\": \"TOKYO, Dec 18 (Reuters) is when the article was written as it states in the first words of the sentence\", \"premise_ms\": \"TOKYO, 18 Dis (Reuters) - Shionogi & Co Jepun berkata pada hari Selasa bahawa ia telah memohon kepada pengawal selia kesihatan di Amerika Syarikat, Kanada dan Eropah untuk kelulusan ubat HIVnya Dolutegravir. Shionogi membangunkan Dolutegravir dengan Viiv Healthcare, sebuah usaha sama ubat AIDS antara GlaxoSmithKline dan Pfizer, sebagai pertukaran untuk haknya terhadap ubat tersebut.\", \"hypothesis_ms\": \"Artikel itu ditulis pada 18 Disember.\"}\r\n"
     ]
    }
   ],
   "source": [
    "!head -n 1 anli.translated.jsonl"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Remaps the integer `label` field of train.jsonl / anli.translated.jsonl onto the ids used by `labels`:\n",
    "# source 0 -> 1 (entailment) and source 2 -> 0 (contradiction).\n",
    "# Source labels without an entry (e.g. 1) make anli.get() return None, and those rows are dropped.\n",
    "# NOTE(review): presumes the source encoding is 0 = entailment, 2 = contradiction - confirm against the dataset cards.\n",
    "anli = {\n",
    "    0: 1,\n",
    "    2: 0,\n",
    "}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "# Peek at the first record of train.jsonl to inspect its fields; `data` is displayed in the next cell.\n",
    "with open('train.jsonl') as fopen:\n",
    "    for l in fopen:\n",
    "        data = json.loads(l)\n",
    "        break  # only the first line is needed"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'premise': 'Presiden Joko Widodo (Jokowi) menyampaikan prediksi bahwa wabah virus Corona (COVID-19) di Indonesia akan selesai akhir tahun ini.',\n",
       " 'hypothesis': 'Prediksi akhir wabah tidak disampaikan Jokowi.',\n",
       " 'label': 2,\n",
       " 'premise_ms': 'Presiden Joko Widodo (Jokowi) menyampaikan prediksi bahwa wabah virus Corona (COVID-19) di Indonesia akan selesai akhir tahun ini.',\n",
       " 'hypothesis_ms': '\"Ramalan akhir wabak tidak dikemukakan oleh Jokowi.\"'}"
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "10330it [00:00, 287687.40it/s]\n",
      "100459it [00:00, 161176.28it/s]\n",
      "392702it [00:02, 143846.66it/s]\n"
     ]
    }
   ],
   "source": [
    "# Build train.json: one JSON object per line, shaped {\"src\": \"ayat1: S1 ayat2: S2\", \"label\": LABEL}.\n",
    "# For the NLI sources the label ids follow `labels` / `anli`; rows with any other label are skipped.\n",
    "TRAIN_SEED = 42  # fixed so that Restart & Run All writes the same mixed-language rows every time\n",
    "MIX_THRESHOLD = 0.8  # a mixed EN/MS row is written when a uniform draw exceeds this (probability ~0.2)\n",
    "train_rng = random.Random(TRAIN_SEED)  # private generator: the global `random` state is left untouched\n",
    "\n",
    "\n",
    "def write_example(fout, sent1, sent2, label):\n",
    "    \"\"\"Append one `ayat1: ... ayat2: ...` example to `fout` as a JSON line.\"\"\"\n",
    "    d = {\"src\": f'ayat1: {sent1} ayat2: {sent2}', \"label\": label}\n",
    "    fout.write(f'{json.dumps(d)}\\n')\n",
    "\n",
    "\n",
    "def write_bilingual(fout, en_pair, ms_pair, label, rng):\n",
    "    \"\"\"Write the EN/EN and MS/MS rows, then each mixed-language row when its own draw passes.\n",
    "\n",
    "    en_pair / ms_pair are (sentence1, sentence2) tuples; rng is a random.Random instance.\n",
    "    \"\"\"\n",
    "    (en1, en2), (ms1, ms2) = en_pair, ms_pair\n",
    "    write_example(fout, en1, en2, label)\n",
    "    write_example(fout, ms1, ms2, label)\n",
    "    # Two independent draws, one per mixed direction, always taken in this order.\n",
    "    if rng.random() > MIX_THRESHOLD:\n",
    "        write_example(fout, en1, ms2, label)\n",
    "    if rng.random() > MIX_THRESHOLD:\n",
    "        write_example(fout, ms1, en2, label)\n",
    "\n",
    "\n",
    "with open('train.json', 'w', encoding='utf-8') as fopen_jsonl:\n",
    "    # train.jsonl: only the *_ms fields are used for this source, so one row per record.\n",
    "    with open('train.jsonl', encoding='utf-8') as fopen:\n",
    "        for l in tqdm(fopen):\n",
    "            data = json.loads(l)\n",
    "            label = anli.get(data['label'])\n",
    "            if label is None:\n",
    "                continue\n",
    "            write_example(fopen_jsonl, data['premise_ms'].strip(), data['hypothesis_ms'].strip(), label)\n",
    "\n",
    "    with open('anli.translated.jsonl', encoding='utf-8') as fopen:\n",
    "        for l in tqdm(fopen):\n",
    "            data = json.loads(l)\n",
    "            label = anli.get(data['label'])\n",
    "            if label is None:\n",
    "                continue\n",
    "            write_bilingual(\n",
    "                fopen_jsonl,\n",
    "                (data['premise'].strip(), data['hypothesis'].strip()),\n",
    "                (data['premise_ms'].strip(), data['hypothesis_ms'].strip()),\n",
    "                label,\n",
    "                train_rng,\n",
    "            )\n",
    "\n",
    "    with open('translated-mnli-train.jsonl', encoding='utf-8') as fopen:\n",
    "        for l in tqdm(fopen):\n",
    "            data = json.loads(l)\n",
    "            # '-' is not in `labels`, so this single membership test also covers it.\n",
    "            if data['gold_label'] not in labels:\n",
    "                continue\n",
    "            label = labels.index(data['gold_label'])\n",
    "            write_bilingual(\n",
    "                fopen_jsonl,\n",
    "                (data['sentence1'].strip(), data['sentence2'].strip()),\n",
    "                (data['translate'][0].strip(), data['translate'][1].strip()),\n",
    "                label,\n",
    "                train_rng,\n",
    "            )\n",
    "\n",
    "    # Files stored as parallel lists X1 / X2 / Y: rows are written as-is (no strip, no language mixing).\n",
    "    # NOTE(review): these four files are not produced or downloaded anywhere in this notebook.\n",
    "    for pair_file in ('sentiment.json', 'topics.json', 'positive-toxicity.json', 'negative-toxicity.json'):\n",
    "        with open(pair_file, encoding='utf-8') as fopen:\n",
    "            data = json.load(fopen)\n",
    "        # Indexing (not zip) keeps the IndexError if X2 or Y is shorter than X1.\n",
    "        for i in range(len(data['X1'])):\n",
    "            write_example(fopen_jsonl, data['X1'][i], data['X2'][i], data['Y'][i])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "1093561 train.json\r\n"
     ]
    }
   ],
   "source": [
    "!wc -l train.json"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [],
   "source": [
    "!shuf train.json > shuffled-train.json"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "10000it [00:00, 123856.58it/s]\n"
     ]
    }
   ],
   "source": [
    "# Build test.json from translated-mnli-dev_matched.jsonl, one {\"src\": ..., \"label\": ...} JSON object per line.\n",
    "TEST_SEED = 42  # fixed so the sampled mixed-language rows are identical on every run\n",
    "test_rng = random.Random(TEST_SEED)  # private generator: the global `random` state is left untouched\n",
    "\n",
    "with open('test.json', 'w', encoding='utf-8') as fopen_jsonl:\n",
    "    with open('translated-mnli-dev_matched.jsonl', encoding='utf-8') as fopen:\n",
    "        for l in tqdm(fopen):\n",
    "            data = json.loads(l)\n",
    "            # '-' is not in `labels`, so this single membership test also covers it.\n",
    "            if data['gold_label'] not in labels:\n",
    "                continue\n",
    "            label = labels.index(data['gold_label'])\n",
    "\n",
    "            sent1 = data['sentence1'].strip()\n",
    "            sent2 = data['sentence2'].strip()\n",
    "            sent1_ms = data['translate'][0].strip()\n",
    "            sent2_ms = data['translate'][1].strip()\n",
    "\n",
    "            # EN/EN and MS/MS are always written; each mixed pairing is added when its own draw exceeds 0.8.\n",
    "            pairs = [(sent1, sent2), (sent1_ms, sent2_ms)]\n",
    "            if test_rng.random() > 0.8:\n",
    "                pairs.append((sent1, sent2_ms))\n",
    "            if test_rng.random() > 0.8:\n",
    "                pairs.append((sent1_ms, sent2))\n",
    "\n",
    "            for left_sent, right_sent in pairs:\n",
    "                d = {\"src\": f'ayat1: {left_sent} ayat2: {right_sent}', \"label\": label}\n",
    "                fopen_jsonl.write(f'{json.dumps(d)}\\n')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [],
   "source": [
    "!shuf test.json > shuffled-test.json"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [],
   "source": [
    "!head -n 1000 shuffled-test.json > test-1k.json"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "16037 shuffled-test.json\r\n"
     ]
    }
   ],
   "source": [
    "!wc -l shuffled-test.json"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "{\"src\": \"ayat1: Tidak ada Tuhan di sini sekarang, kata suara raksasa di hadapan mereka. ayat2: A monster spoke to them.\", \"label\": 1}\r\n",
      "{\"src\": \"ayat1: Ini adalah ketepuan dan 125 keping urutan berjalan Enhanced Carrier Route mel volume pada tahun 1996. ayat2: Laluan Pengangkut Enhanced diperkenalkan sebelum tahun 1997.\", \"label\": 1}\r\n",
      "{\"src\": \"ayat1: Untuk mencapai mana-mana tiga Carbet jatuh, anda mesti terus berjalan selepas jalan-jalan berakhir selama 20 minit, 30 minit, atau dua jam masing-masing. ayat2: Terdapat tiga laluan ke tiga Carbet jatuh, masing-masing panjang yang berbeza dan semuanya berterusan selepas jalan seolah-olah berakhir.\", \"label\": 1}\r\n",
      "{\"src\": \"ayat1: Dia mempunyai dua fikiran, satu gembira dalam kedamaian kampung ini. ayat2: Kampung ini penuh dengan keganasan.\", \"label\": 0}\r\n",
      "{\"src\": \"ayat1: but uh i've always enjoyed uh the train and you know fooling with it and all ayat2: Saya selalu suka kereta api dan bermain-main dengannya.\", \"label\": 1}\r\n",
      "{\"src\": \"ayat1: Hampir langsung di atas, terdapat tempat sewa di mana ketiadaan warna atau ciri yang aneh menunjukkan lubang di kubah di atasnya. ayat2: Terdapat tempat sewa di mana ketiadaan warna yang aneh menunjukkan lubang di atasnya.\", \"label\": 1}\r\n",
      "{\"src\": \"ayat1: dalam satu pengertian um um i'm an older person in my fifties jadi saya rasa bahawa kita telah kehilangan beberapa perkara dalam arti bahawa wanita harus bekerja hari ini ayat2: As someone from an older generation, I feel like things have changed.\", \"label\": 1}\r\n",
      "{\"src\": \"ayat1: Jon replaced Susan's cloak with a white robe and a head scarf, also quite dirty. ayat2: Jon replaced her cloak with a red robe and a clean arm scarf.\", \"label\": 0}\r\n",
      "{\"src\": \"ayat1: Tidak kisahlah bahawa filem itu telah keluar selama berbulan-bulan dan pencalonan Pelakon Pembantu Terbaik Oscar telah dianugerahkan untuk penggambaran watak wanita. ayat2: Pelakon meletup\", \"label\": 0}\r\n",
      "{\"src\": \"ayat1: Cara teknikal untuk ketiga-tiga strategi ini akan diringkaskan kemudian dalam kertas ini. ayat2: Terdapat tujuh strategi yang dibincangkan dalam kertas kerja.\", \"label\": 0}\r\n"
     ]
    }
   ],
   "source": [
    "!head -n 10 shuffled-test.json"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.10"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
